{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'0.22.2.post1'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import sklearn\n",
    "sklearn.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 重复性设置\n",
    "seed = 2020"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 基本使用\n",
    "## 参数不冲突时\n",
    "参数不冲突时，直接用一个字典传递参数和要对应的候选值给GridSearchCV即可  \n",
    "我这里的参数冲突指的是类似下面这种情况：  \n",
    "① 参数取值受限\n",
    "参数a='a'时，参数b只能取'b'  \n",
    "参数a='A'时，参数b能取'b'或'B'  \n",
    "② 参数互斥\n",
    "参数 a 或 b 二者只能选一个  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 6 candidates, totalling 30 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.\n",
      "[Parallel(n_jobs=2)]: Done  30 out of  30 | elapsed:    0.0s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score=nan,\n",
       "             estimator=SVC(C=1.0, break_ties=False, cache_size=200,\n",
       "                           class_weight=None, coef0=0.0,\n",
       "                           decision_function_shape='ovr', degree=3,\n",
       "                           gamma='scale', kernel='rbf', max_iter=-1,\n",
       "                           probability=False, random_state=2020, shrinking=True,\n",
       "                           tol=0.001, verbose=False),\n",
       "             iid='deprecated', n_jobs=2,\n",
       "             param_grid={'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']},\n",
       "             pre_dispatch='2*n_jobs', refit='f1_mi', return_train_score=False,\n",
       "             scoring={'acc': 'accuracy', 'f1_mi': 'f1_micro',\n",
       "                      'loss': make_scorer(custom_loss_func, greater_is_better=False)},\n",
       "             verbose=1)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import datasets\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "iris = datasets.load_iris()\n",
    "model = SVC(random_state=seed)\n",
    "\n",
    "# 需调参数及候选值\n",
    "parameters = {\n",
    "    'C': [0.1, 1, 10], \n",
    "    'kernel': ['rbf', 'linear']\n",
    "}\n",
    "\n",
    "# 评价依据\n",
    "## 可用评价指标\n",
    "## https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter\n",
    "## 使用自定义评价指标\n",
    "from sklearn.metrics import make_scorer\n",
    "def custom_loss_func(y_true, y_pred):\n",
    "    return len(y_true[y_true!=y_pred])/len(y_true)\n",
    "# greater_is_better=False，指标越小越好\n",
    "# needs_proba=False，指标通过标签计算，不是通过概率\n",
    "loss_socre = make_scorer(custom_loss_func, greater_is_better=False, needs_proba=False)\n",
    "scores = {\n",
    "    'acc': 'accuracy',         # 准确率\n",
    "    'f1_mi': 'f1_micro',       # 一种多分类f1值\n",
    "    'loss': loss_socre         # 自定义评价指标\n",
    "}\n",
    "\n",
    "# 网格搜索实例\n",
    "gs = GridSearchCV(\n",
    "    model,\n",
    "    parameters,\n",
    "    cv=5,                      # 交叉验证数\n",
    "    scoring=scores,            # 评价指标\n",
    "    refit='f1_mi',             # 在此指标下，用最优表现的参数重新训练模型\n",
    "#     return_train_score=True,   # gs.cv_results_额外保存训练集的评价结果\n",
    "    verbose=1,                 # 日志信息，默认0不输出\n",
    "    n_jobs=2                   # 并行加速\n",
    ")\n",
    "\n",
    "# 一共要跑的任务数=参数1候选值*...*参数i候选值*交叉验证数\n",
    "# 这里就是3*2*5=30\n",
    "gs.fit(iris.data, iris.target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "最优参数\n",
      "{'C': 1, 'kernel': 'linear'}\n",
      "最佳模型的评分\n",
      "0.9800000000000001\n",
      "最优模型\n"
     ]
    }
   ],
   "source": [
    "print(\"最优参数\")\n",
    "print(gs.best_params_)\n",
    "print(\"最佳模型的评分\")\n",
    "print(gs.best_score_)\n",
    "print(\"最优模型\")\n",
    "best_model = gs.best_estimator_  # GridSearchCV的refit参数不能为False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_C', 'param_kernel', 'params', 'split0_test_acc', 'split1_test_acc', 'split2_test_acc', 'split3_test_acc', 'split4_test_acc', 'mean_test_acc', 'std_test_acc', 'rank_test_acc', 'split0_test_f1_mi', 'split1_test_f1_mi', 'split2_test_f1_mi', 'split3_test_f1_mi', 'split4_test_f1_mi', 'mean_test_f1_mi', 'std_test_f1_mi', 'rank_test_f1_mi', 'split0_test_loss', 'split1_test_loss', 'split2_test_loss', 'split3_test_loss', 'split4_test_loss', 'mean_test_loss', 'std_test_loss', 'rank_test_loss'])"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gs.cv_results_.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean_test_acc</th>\n",
       "      <th>mean_test_f1_mi</th>\n",
       "      <th>mean_test_loss</th>\n",
       "      <th>params</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.980000</td>\n",
       "      <td>0.980000</td>\n",
       "      <td>-0.020000</td>\n",
       "      <td>{'C': 1, 'kernel': 'linear'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.980000</td>\n",
       "      <td>0.980000</td>\n",
       "      <td>-0.020000</td>\n",
       "      <td>{'C': 10, 'kernel': 'rbf'}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.973333</td>\n",
       "      <td>0.973333</td>\n",
       "      <td>-0.026667</td>\n",
       "      <td>{'C': 0.1, 'kernel': 'linear'}</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   mean_test_acc  mean_test_f1_mi  mean_test_loss  \\\n",
       "3       0.980000         0.980000       -0.020000   \n",
       "4       0.980000         0.980000       -0.020000   \n",
       "1       0.973333         0.973333       -0.026667   \n",
       "\n",
       "                           params  \n",
       "3    {'C': 1, 'kernel': 'linear'}  \n",
       "4      {'C': 10, 'kernel': 'rbf'}  \n",
       "1  {'C': 0.1, 'kernel': 'linear'}  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"\n",
    "用表格查看训练信息\n",
    "\"\"\"\n",
    "cv_results = pd.DataFrame(gs.cv_results_)\n",
    "# 查看其他指标的结果和参数，比如这里按平均准确率排序\n",
    "cv_results = cv_results.sort_values(by=\"mean_test_acc\", ascending=False)\n",
    "shown_columns = [\"mean_test_\"+col for col in scores.keys()] + [\"params\"]\n",
    "cv_results[shown_columns].head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 参数冲突时"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "参数冲突时，互斥参数搜索空间用不同字典来描述  \n",
    "将这些字典放到列表中再传递给GridSearchCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 15 candidates, totalling 75 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.\n",
      "[Parallel(n_jobs=2)]: Done  75 out of  75 | elapsed:    0.0s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score=nan,\n",
       "             estimator=SVC(C=1.0, break_ties=False, cache_size=200,\n",
       "                           class_weight=None, coef0=0.0,\n",
       "                           decision_function_shape='ovr', degree=3,\n",
       "                           gamma='scale', kernel='rbf', max_iter=-1,\n",
       "                           probability=False, random_state=2020, shrinking=True,\n",
       "                           tol=0.001, verbose=False),\n",
       "             iid='deprecated', n_jobs=2,\n",
       "             param_grid=[{'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']},\n",
       "                         {'C': [0.1, 1, 10], 'degree': [1, 3, 5],\n",
       "                          'kernel': ['poly']}],\n",
       "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
       "             scoring='accuracy', verbose=1)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import datasets\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "iris = datasets.load_iris()\n",
    "model = SVC(random_state=seed)\n",
    "\n",
    "parameters = [\n",
    "    {\n",
    "        'C': [0.1, 1, 10], \n",
    "        'kernel': ['rbf', 'linear']\n",
    "    },\n",
    "    {\n",
    "        'C': [0.1, 1, 10],\n",
    "        'kernel': ['poly'],\n",
    "        'degree': [1, 3, 5]\n",
    "    }\n",
    "]\n",
    "\n",
    "gs = GridSearchCV(\n",
    "    model,\n",
    "    parameters,\n",
    "    cv=5,\n",
    "    scoring='accuracy',\n",
    "    verbose=1,\n",
    "    n_jobs=2,\n",
    ")\n",
    "\n",
    "gs.fit(iris.data, iris.target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "最优参数\n",
      "{'C': 0.1, 'degree': 3, 'kernel': 'poly'}\n",
      "最佳模型的评分\n",
      "0.9866666666666667\n",
      "最优模型\n"
     ]
    }
   ],
   "source": [
    "print(\"最优参数\")\n",
    "print(gs.best_params_)\n",
    "print(\"最佳模型的评分\")\n",
    "print(gs.best_score_)\n",
    "print(\"最优模型\")\n",
    "best_model = gs.best_estimator_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征选择+模型复合调参"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "管道可以用来连接多个操作，比如特征选择+模型训练，数据处理+模型训练等等    \n",
    "如果这些操作也有参数可调，可以用 GridSearchCV 对它们一起调参"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 90 candidates, totalling 450 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.\n",
      "[Parallel(n_jobs=2)]: Done 440 tasks      | elapsed:    0.9s\n",
      "[Parallel(n_jobs=2)]: Done 447 out of 450 | elapsed:    0.9s remaining:    0.0s\n",
      "[Parallel(n_jobs=2)]: Done 450 out of 450 | elapsed:    0.9s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score=nan,\n",
       "             estimator=Pipeline(memory=None,\n",
       "                                steps=[('selector',\n",
       "                                        SelectKBest(k=10,\n",
       "                                                    score_func=<function f_classif at 0x0000028493B1A5E8>)),\n",
       "                                       ('model',\n",
       "                                        SVC(C=1.0, break_ties=False,\n",
       "                                            cache_size=200, class_weight=None,\n",
       "                                            coef0=0.0,\n",
       "                                            decision_function_shape='ovr',\n",
       "                                            degree=3, gamma='scale',\n",
       "                                            kernel='rbf', max_iter=-1,\n",
       "                                            probability=False,\n",
       "                                            random_state=2020, shr...\n",
       "                          'selector__score_func': [<function chi2 at 0x0000028493B1A168>,\n",
       "                                                   <function f_classif at 0x0000028493B1A5E8>]},\n",
       "                         {'model__C': [0.1, 1, 10], 'model__degree': [1, 3, 5],\n",
       "                          'model__kernel': ['poly'], 'selector__k': [2, 3, 4],\n",
       "                          'selector__score_func': [<function chi2 at 0x0000028493B1A168>,\n",
       "                                                   <function f_classif at 0x0000028493B1A5E8>]}],\n",
       "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
       "             scoring='accuracy', verbose=1)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import datasets\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.feature_selection import SelectKBest, chi2, f_classif\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "iris = datasets.load_iris()\n",
    "\n",
    "pipe = Pipeline([\n",
    "    ('selector', SelectKBest()),       # 特征选择\n",
    "    ('model', SVC(random_state=seed))  # 模型\n",
    "])\n",
    "\n",
    "# “双下划线”指定要调整的部件及其参数\n",
    "parameters = [\n",
    "    {\n",
    "        'selector__score_func': [chi2, f_classif],\n",
    "        'selector__k': [2, 3, 4],\n",
    "        'model__C': [0.1, 1, 10], \n",
    "        'model__kernel': ['rbf', 'linear']\n",
    "    },\n",
    "    {\n",
    "        'selector__score_func': [chi2, f_classif],\n",
    "        'selector__k': [2, 3, 4],\n",
    "        'model__C': [0.1, 1, 10],\n",
    "        'model__kernel': ['poly'],\n",
    "        'model__degree': [1, 3, 5]\n",
    "    }\n",
    "]\n",
    "\n",
    "\n",
    "gs = GridSearchCV(\n",
    "    pipe,\n",
    "    parameters,\n",
    "    cv=5,\n",
    "    scoring='accuracy',\n",
    "    verbose=1,\n",
    "    n_jobs=2,\n",
    ")\n",
    "\n",
    "gs.fit(iris.data, iris.target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "最优参数\n",
      "{'model__C': 0.1, 'model__degree': 3, 'model__kernel': 'poly', 'selector__k': 4, 'selector__score_func': <function chi2 at 0x0000028493B1A168>}\n",
      "最佳模型的评分\n",
      "0.9866666666666667\n",
      "最优组合\n"
     ]
    }
   ],
   "source": [
    "print(\"最优参数\")\n",
    "print(gs.best_params_)\n",
    "print(\"最佳模型的评分\")\n",
    "print(gs.best_score_)\n",
    "print(\"最优组合\")\n",
    "# best_pipe = gs.best_estimator_\n",
    "best_selector = gs.best_estimator_[0]\n",
    "best_model = gs.best_estimator_[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,\n",
       "    decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',\n",
       "    max_iter=-1, probability=False, random_state=2020, shrinking=True,\n",
       "    tol=0.001, verbose=False)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "best_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "246.6px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
